<!DOCTYPE html>
<html lang="zh-CN">





<head>
  <meta charset="UTF-8">
  <link rel="apple-touch-icon" sizes="76x76" href="/img/favicon.png">
  <link rel="icon" type="image/png" href="/img/favicon.png">
  <meta name="viewport"
        content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no, shrink-to-fit=no">
  <meta http-equiv="x-ua-compatible" content="ie=edge">
  
  <meta name="theme-color" content="#2f4154">
  <meta name="description" content="熊儿老猫的博客">
  <meta name="author" content="Zhenglin Li">
  <meta name="keywords" content="">
  <title>Python爬虫入门（一） - 熊儿老猫的博客</title>

  <link  rel="stylesheet" href="https://cdn.staticfile.org/twitter-bootstrap/4.4.1/css/bootstrap.min.css" />


  <link  rel="stylesheet" href="https://cdn.staticfile.org/github-markdown-css/4.0.0/github-markdown.min.css" />
  <link  rel="stylesheet" href="/lib/hint/hint.min.css" />

  
    <link  rel="stylesheet" href="https://cdn.staticfile.org/highlight.js/10.0.0/styles/github-gist.min.css" />
  

  


<!-- 主题依赖的图标库，不要自行修改 -->

<link rel="stylesheet" href="//at.alicdn.com/t/font_1749284_yg9cfy8wd6.css">



<link rel="stylesheet" href="//at.alicdn.com/t/font_1736178_pjno9b9zyxs.css">


<link  rel="stylesheet" href="/css/main.css" />

<!-- 自定义样式保持在最底部 -->


  <script  src="/js/utils.js" ></script>
<meta name="generator" content="Hexo 4.2.1"></head>


<body>
  <header style="height: 70vh;">
    <nav id="navbar" class="navbar fixed-top  navbar-expand-lg navbar-dark scrolling-navbar">
  <div class="container">
    <a class="navbar-brand"
       href="/">&nbsp;<strong>熊儿老猫的Blog</strong>&nbsp;</a>

    <button id="navbar-toggler-btn" class="navbar-toggler" type="button" data-toggle="collapse"
            data-target="#navbarSupportedContent"
            aria-controls="navbarSupportedContent" aria-expanded="false" aria-label="Toggle navigation">
      <div class="animated-icon"><span></span><span></span><span></span></div>
    </button>

    <!-- Collapsible content -->
    <div class="collapse navbar-collapse" id="navbarSupportedContent">
      <ul class="navbar-nav ml-auto text-center">
        
          
          
          
          
            <li class="nav-item">
              <a class="nav-link" href="/">
                <i class="iconfont icon-home-fill"></i>
                首页
              </a>
            </li>
          
        
          
          
          
          
            <li class="nav-item">
              <a class="nav-link" href="/archives/">
                <i class="iconfont icon-archive-fill"></i>
                归档
              </a>
            </li>
          
        
          
          
          
          
            <li class="nav-item">
              <a class="nav-link" href="/categories/">
                <i class="iconfont icon-category-fill"></i>
                分类
              </a>
            </li>
          
        
          
          
          
          
            <li class="nav-item">
              <a class="nav-link" href="/about/">
                <i class="iconfont icon-user-fill"></i>
                关于
              </a>
            </li>
          
        
          
          
          
          
            <li class="nav-item">
              <a class="nav-link" href="/links/">
                <i class="iconfont icon-link-fill"></i>
                友链
              </a>
            </li>
          
        
        
          <li class="nav-item" id="search-btn">
            <a class="nav-link" data-toggle="modal" data-target="#modalSearch">&nbsp;&nbsp;<i
                class="iconfont icon-search"></i>&nbsp;&nbsp;</a>
          </li>
        
      </ul>
    </div>
  </div>
</nav>

    <div class="view intro-2" id="background" parallax=true
         style="background: url('https://farm1.staticflickr.com/700/21371649841_2d54662091_z.jpg') no-repeat center center;
           background-size: cover;">
      <div class="full-bg-img">
        <div class="mask flex-center" style="background-color: rgba(0, 0, 0, 0.3)">
          <div class="container text-center white-text fadeInUp">
            <span class="h2" id="subtitle">
              
            </span>

            
              
  <div class="mt-3 post-meta">
    <i class="iconfont icon-date-fill" aria-hidden="true"></i>
    <time datetime="2020-07-20 21:41">
      2020年7月20日 晚上
    </time>
  </div>


<div class="mt-1">
  
    
    <span class="post-meta mr-2">
      <i class="iconfont icon-chart"></i>
      758 字
    </span>
  

  
    
    <span class="post-meta mr-2">
      <i class="iconfont icon-clock-fill"></i>
      
      
      9
       分钟
    </span>
  

  
  
    
      <!-- 不蒜子统计文章PV -->
      <span id="busuanzi_container_page_pv" style="display: none">
        <i class="iconfont icon-eye" aria-hidden="true"></i>
        <span id="busuanzi_value_page_pv"></span> 次
      </span>
    
  
</div>

            
          </div>

          
        </div>
      </div>
    </div>
  </header>

  <main>
    
      

<div class="container-fluid">
  <div class="row">
    <div class="d-none d-lg-block col-lg-2"></div>
    <div class="col-lg-8 nopadding-md">
      <div class="container nopadding-md" id="board-ctn">
        <div class="py-5" id="board">
          <div class="post-content mx-auto" id="post">
            
              <p class="note note-info">
                
                  本文最后更新于：15 天前
                
              </p>
            
            <article class="markdown-body">
              <h1 id="总述"><a href="#总述" class="headerlink" title="总述"></a>总述</h1><p>本来早就想学习下python爬虫了，总是找各种借口，一直拖到现在才开始系统的学习。</p>
<p>我用的教程是中国大学MOOC上的由北京理工大学开设的Python网络爬虫与信息提取。</p>
<p>废话不多说，直接开始。</p>
<h1 id="1-网络爬虫之规则"><a href="#1-网络爬虫之规则" class="headerlink" title="1. 网络爬虫之规则"></a>1. 网络爬虫之规则</h1><h2 id="1-1-requests库入门"><a href="#1-1-requests库入门" class="headerlink" title="1.1 requests库入门"></a>1.1 requests库入门</h2><h3 id="1-1-1-requests库简介"><a href="#1-1-1-requests库简介" class="headerlink" title="1.1.1 requests库简介"></a>1.1.1 requests库简介</h3><ol>
<li><p>Requests 是唯一的一个非转基因的 Python HTTP 库，人类可以安全享用。</p>
</li>
<li><p>安装     <code>pip install requests</code></p>
</li>
<li><p>引入    <code>import requests</code></p>
</li>
<li><p>7种常用方法</p>
<p>  <code>requests.request()</code> 构造一个请求，最基础的方法<br>  <code>requests.get()</code>     获得HTML网页的主要方法，对应于HTTP的GET<br>  <code>requests.head()</code>    获得HTML网页头的主要方法，对应于HTTP的HEAD<br>  <code>requests.post()</code>    提交post请求，POST<br>  <code>requests.put()</code>     提交put请求，PUT<br>  <code>requests.patch()</code>   提交局部修改请求,PATCH<br>  <code>requests.delete()</code>  删除请求，DELETE</p>
</li>
</ol>
<ol start="5">
<li><p>requests库的get()方法</p>
<p><code>r = requests.get(url)</code><br>该语句的意为构造一个向服务器请求资源的Request对象，返回的是一个包含服务器资源的Response对象，用r来接收。</p>
</li>
<li><p>Response对象的属性</p>
<p><code>r.ststus_code</code>           http请求的返回状态，200为成功<br><code>r.text</code>                  http响应内容的字符串形式，url对应的页面内容<br><code>r.encoding</code>              http header中猜测的编码格式<br><code>r.apparent_encoding</code>     http内容中猜测的响应内容的编码格式，备用<br><code>r.content</code>               http响应内容的二进制形式</p>
<h3 id="1-1-2-通用代码框架"><a href="#1-1-2-通用代码框架" class="headerlink" title="1.1.2 通用代码框架"></a>1.1.2 通用代码框架</h3></li>
<li><p>Requests库的异常</p>
<p><code>requests.ConnectionError</code>   网络连接异常<br><code>requests.HTTPError</code>         http错误异常<br><code>requesrs.URLRequired</code>       URL缺失异常<br><code>requests,ToomanyRedirects</code>  重定向异常，超过最大重定向次数<br><code>requests.ConnectTimeOut</code>    连接远程服务器超时异常<br><code>requests,Time</code>              请求URL超时，超时异常（包括上述的整个请求过程）</p>
</li>
<li><p>通用代码框架</p>
</li>
</ol>
<pre><code class="hljs python"><span class="hljs-keyword">import</span> requests


<span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">getHTMLText</span><span class="hljs-params">(url)</span>:</span>
    <span class="hljs-keyword">try</span>:
        r = requests.get(url, timeout=<span class="hljs-number">30</span>)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        <span class="hljs-keyword">return</span> r.text
    <span class="hljs-keyword">except</span>:
        <span class="hljs-keyword">return</span> <span class="hljs-string">'超时异常'</span>


<span class="hljs-keyword">if</span> __name__ == <span class="hljs-string">'__main__'</span>:
    url = <span class="hljs-string">"https://www.baidu.com/"</span>
    print(getHTMLText(url))</code></pre>

<h3 id="HTTP协议及Requests库方法"><a href="#HTTP协议及Requests库方法" class="headerlink" title="HTTP协议及Requests库方法"></a>HTTP协议及Requests库方法</h3><ol>
<li><p>HTTP协议</p>
<p>HTTP协议, Hypertext Transfer Protocol, 超文本传输协议。<br>是一个基于“请求与响应”模式的、无状态的应用层协议。<br>采用URL作为定位网络资源的标识。</p>
</li>
<li><p>URL</p>
<p>URL是通过HTTP协议存取资源的Internet路径，一个URL对应一个数据资源。<br>格式为：<a href="http://host[:port][path]">http://host[:port][path]</a><br>host: 合法的Internet主机域名或IP地址<br>port: 端口号，省略端口为80<br>path: 请求资源的路径</p>
</li>
<li><p>HTTP协议对资源的操作</p>
</li>
</ol>
<p>GET：发送一个请求来取得服务器上的某一资源。</p>
<p>HEAD：HEAD和GET是一样的，区别在于HEAD不含有呈现数据，而仅仅是HTTP头信息。</p>
<p>POST：向服务器提交数据。这个方法用途广泛，几乎目前所有的提交操作都是靠这个完成。</p>
<p>PUT：这个方法比较少见。HTML表单也不支持这个。本质上来讲， PUT和POST极为相似，都是向服务器发送数据。</p>
<p>PATCH：向服务器更新数据。</p>
<p>DELETE：删除某一个资源。</p>
<ol start="4">
<li>HTTP协议方法与requests库方法一致。</li>
</ol>

            </article>
            <hr>
            <div>
              <div class="post-metas mb-3">
                
                
                  <div class="post-meta">
                    <i class="iconfont icon-tags"></i>
                    
                      <a class="hover-with-bg" href="/tags/python%E7%88%AC%E8%99%AB/">python爬虫</a>
                    
                  </div>
                
              </div>
              
              
                <div class="post-prevnext row">
                  <div class="post-prev col-6">
                    
                    
                      <a href="/2020/07/29/%E5%B0%8F%E9%95%87%E5%81%9A%E9%A2%98%E5%AE%B6/">
                        <i class="iconfont icon-arrowleft"></i>
                        <span class="hidden-mobile">小镇做题家</span>
                        <span class="visible-mobile">上一篇</span>
                      </a>
                    
                  </div>
                  <div class="post-next col-6">
                    
                    
                      <a href="/2020/07/19/mysql%E5%AE%89%E8%A3%85/">
                        <span class="hidden-mobile">mysql安装</span>
                        <span class="visible-mobile">下一篇</span>
                        <i class="iconfont icon-arrowright"></i>
                      </a>
                    
                  </div>
                </div>
              
            </div>

            
          </div>
        </div>
      </div>
    </div>
    
      <div class="d-none d-lg-block col-lg-2 toc-container" id="toc-ctn">
        <div id="toc">
  <p class="toc-header"><i class="iconfont icon-list"></i>&nbsp;目录</p>
  <div id="tocbot"></div>
</div>

      </div>
    
  </div>
</div>

<!-- Custom -->


    
  </main>

  
    <a id="scroll-top-button" href="#" role="button">
      <i class="iconfont icon-arrowup" aria-hidden="true"></i>
    </a>
  

  
    <div class="modal fade" id="modalSearch" tabindex="-1" role="dialog" aria-labelledby="ModalLabel"
     aria-hidden="true">
  <div class="modal-dialog modal-dialog-scrollable modal-lg" role="document">
    <div class="modal-content">
      <div class="modal-header text-center">
        <h4 class="modal-title w-100 font-weight-bold">搜索</h4>
        <button type="button" id="local-search-close" class="close" data-dismiss="modal" aria-label="Close">
          <span aria-hidden="true">&times;</span>
        </button>
      </div>
      <div class="modal-body mx-3">
        <div class="md-form mb-5">
          <input type="text" id="local-search-input" class="form-control validate">
          <label data-error="x" data-success="v"
                 for="local-search-input">关键词</label>
        </div>
        <div class="list-group" id="local-search-result"></div>
      </div>
    </div>
  </div>
</div>
  

  

  

  <footer class="mt-5">
  <div class="text-center py-3">
    <div>
      <a href="https://hexo.io" target="_blank" rel="nofollow noopener"><span>Hexo</span></a>
      <i class="iconfont icon-love"></i>
      <a href="https://github.com/fluid-dev/hexo-theme-fluid" target="_blank" rel="nofollow noopener">
        <span>Fluid</span></a>
    </div>
    

    

    
  </div>
</footer>

<!-- SCRIPTS -->
<script  src="https://cdn.staticfile.org/jquery/3.4.1/jquery.min.js" ></script>
<script  src="https://cdn.staticfile.org/twitter-bootstrap/4.4.1/js/bootstrap.min.js" ></script>
<script  src="/js/debouncer.js" ></script>
<script  src="/js/main.js" ></script>

<!-- Plugins -->


  
    <script  src="/js/lazyload.js" ></script>
  



  <script defer src="https://cdn.staticfile.org/clipboard.js/2.0.6/clipboard.min.js" ></script>
  <script  src="/js/clipboard-use.js" ></script>



  <script defer src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js" ></script>





  <script  src="https://cdn.staticfile.org/tocbot/4.11.1/tocbot.min.js" ></script>
  <script>
    $(document).ready(function () {
      var boardCtn = $('#board-ctn');
      var boardTop = boardCtn.offset().top;

      tocbot.init({
        tocSelector: '#tocbot',
        contentSelector: 'article.markdown-body',
        headingSelector: 'h1,h2,h3,h4,h5,h6',
        linkClass: 'tocbot-link',
        activeLinkClass: 'tocbot-active-link',
        listClass: 'tocbot-list',
        isCollapsedClass: 'tocbot-is-collapsed',
        collapsibleClass: 'tocbot-is-collapsible',
        collapseDepth: 0,
        scrollSmooth: true,
        headingsOffset: -boardTop
      });
      if ($('.toc-list-item').length > 0) {
        $('#toc').css('visibility', 'visible');
      }
    });
  </script>



  <script  src="https://cdn.staticfile.org/typed.js/2.0.11/typed.min.js" ></script>
  <script>
    var typed = new Typed('#subtitle', {
      strings: [
        '  ',
        "Python爬虫入门（一）&nbsp;",
      ],
      cursorChar: "_",
      typeSpeed: 88,
      loop: false,
    });
    typed.stop();
    $(document).ready(function () {
      $(".typed-cursor").addClass("h2");
      typed.start();
    });
  </script>



  <script  src="https://cdn.staticfile.org/anchor-js/4.2.2/anchor.min.js" ></script>
  <script>
    anchors.options = {
      placement: "right",
      visible: "hover",
      
    };
    var el = "h1,h2,h3,h4,h5,h6".split(",");
    var res = [];
    for (item of el) {
      res.push(".markdown-body > " + item)
    }
    anchors.add(res.join(", "))
  </script>



  <script  src="/js/local-search.js" ></script>
  <script>
    var path = "/local-search.xml";
    var inputArea = document.querySelector("#local-search-input");
    inputArea.onclick = function () {
      searchFunc(path, 'local-search-input', 'local-search-result');
      this.onclick = null
    }
  </script>



  <script  src="https://cdn.staticfile.org/fancybox/3.5.7/jquery.fancybox.min.js" ></script>
  <link  rel="stylesheet" href="https://cdn.staticfile.org/fancybox/3.5.7/jquery.fancybox.min.css" />

  <script>
    $('#post img:not(.no-zoom img, img[no-zoom]), img[zoom]').each(
      function () {
        var element = document.createElement('a');
        $(element).attr('data-fancybox', 'images');
        $(element).attr('href', $(this).attr('src'));
        $(this).wrap(element);
      }
    );
  </script>







  
  
    <script type="text/javascript">
      //定义获取词语下标
      var a_idx = 0;
      jQuery(document).ready(function ($) {
        //点击body时触发事件
        $("body").click(function (e) {
          //需要显示的词语
          var a = new Array("富强", "民主", "文明", "和谐", "自由", "平等", "公正", "法治", "爱国", "敬业", "诚信", "友善");
          //设置词语给span标签
          var $i = $("<span/>").text(a[a_idx]);
          //下标等于原来下标+1  余 词语总数
          a_idx = (a_idx + 1) % a.length;
          //获取鼠标指针的位置，分别相对于文档的左和右边缘。
          //获取x和y的指针坐标
          var x = e.pageX, y = e.pageY;
          //在鼠标的指针的位置给$i定义的span标签添加css样式
          $i.css({
            "z-index": 999,
            "top": y - 20,
            "left": x,
            "position": "absolute",
            "font-weight": "bold",
            "color": rand_color()
          });
          // 随机颜色
          function rand_color() {
            return "rgb(" + ~~(255 * Math.random()) + "," + ~~(255 * Math.random()) + "," + ~~(255 * Math.random()) + ")"
          }
          //在body添加这个标签
          $("body").append($i);
          //animate() 方法执行 CSS 属性集的自定义动画。
          //该方法通过CSS样式将元素从一个状态改变为另一个状态。CSS属性值是逐渐改变的，这样就可以创建动画效果。
          //详情请看http://www.w3school.com.cn/jquery/effect_animate.asp
          $i.animate({
            //将原来的位置向上移动180
            "top": y - 180,
            "opacity": 0
            //1500动画的速度
          }, 1500, function () {
            //时间到了自动删除
            $i.remove();
          });
        });
      })
      ;
    </script>
  














<script src="https://cdn.jsdelivr.net/npm/live2d-widget@^3.1.3/lib/L2Dwidget.min.js"></script><script>L2Dwidget.init({"pluginRootPath":"live2dw/","pluginJsPath":"lib/","pluginModelPath":"assets/","tagMode":false,"debug":false,"model":{"scale":1,"hHeadPos":0.5,"vHeadPos":0.618,"jsonPath":"/live2dw/assets/tororo.model.json"},"display":{"superSample":2,"position":"right","width":150,"height":300,"hOffset":20,"vOffset":-90},"mobile":{"show":true,"scale":1},"react":{"opacityDefault":0.3,"opacityOnHover":0.3,"opacity":0.95},"log":false});</script></body>
</html>
